df <- as.data.frame(read.table('./acea-water-prediction/Aquifer_Petrignano.csv', header = TRUE, sep=','))
head(df, n=20)
## п.їDate Rainfall_Bastia_Umbra Depth_to_Groundwater_P24
## 1 14/03/2006 NA -22.48
## 2 15/03/2006 NA -22.38
## 3 16/03/2006 NA -22.25
## 4 17/03/2006 NA -22.38
## 5 18/03/2006 NA -22.60
## 6 19/03/2006 NA -22.35
## 7 20/03/2006 NA -22.50
## 8 21/03/2006 NA -22.31
## 9 22/03/2006 NA -22.31
## 10 23/03/2006 NA -22.40
## 11 24/03/2006 NA -22.32
## 12 25/03/2006 NA -22.25
## 13 26/03/2006 NA -22.15
## 14 27/03/2006 NA -22.47
## 15 28/03/2006 NA -22.27
## 16 29/03/2006 NA -22.52
## 17 30/03/2006 NA -22.50
## 18 31/03/2006 NA -22.70
## 19 01/04/2006 NA -22.77
## 20 02/04/2006 NA -22.49
## Depth_to_Groundwater_P25 Temperature_Bastia_Umbra Temperature_Petrignano
## 1 -22.18 NA NA
## 2 -22.14 NA NA
## 3 -22.04 NA NA
## 4 -22.04 NA NA
## 5 -22.04 NA NA
## 6 -21.95 NA NA
## 7 -21.99 NA NA
## 8 -21.89 NA NA
## 9 -21.82 NA NA
## 10 -21.89 NA NA
## 11 -21.89 NA NA
## 12 -21.82 NA NA
## 13 -21.79 NA NA
## 14 -21.83 NA NA
## 15 -21.75 NA NA
## 16 -21.81 NA NA
## 17 -21.84 NA NA
## 18 -21.87 NA NA
## 19 -21.90 NA NA
## 20 -21.83 NA NA
## Volume_C10_Petrignano Hydrometry_Fiume_Chiascio_Petrignano
## 1 NA NA
## 2 NA NA
## 3 NA NA
## 4 NA NA
## 5 NA NA
## 6 NA NA
## 7 NA NA
## 8 NA NA
## 9 NA NA
## 10 NA NA
## 11 NA NA
## 12 NA NA
## 13 NA NA
## 14 NA NA
## 15 NA NA
## 16 NA NA
## 17 NA NA
## 18 NA NA
## 19 NA NA
## 20 NA NA
Remove NA elements from data
df <- df[complete.cases(df$Rainfall_Bastia_Umbra),]
rownames(df) <- 1:nrow(df)
df <- subset(df, select = -c(Depth_to_Groundwater_P25, Temperature_Petrignano ))
colnames(df) <- c("Date", "Rainfall", "Depth_to_Groundwater","Temperature", " Volume", " Hydrometry" )
head(df, n=20)
## Date Rainfall Depth_to_Groundwater Temperature Volume Hydrometry
## 1 01/01/2009 0.0 -31.96 5.2 -24530.69 2.4
## 2 02/01/2009 0.0 -32.03 2.3 -28785.89 2.5
## 3 03/01/2009 0.0 -31.97 4.4 -25766.21 2.4
## 4 04/01/2009 0.0 -31.91 0.8 -27919.30 2.4
## 5 05/01/2009 0.0 -31.94 -1.9 -29854.66 2.3
## 6 06/01/2009 0.0 -31.89 -0.7 -29124.58 2.3
## 7 07/01/2009 0.0 -31.91 1.5 -31173.12 2.3
## 8 08/01/2009 0.0 -31.83 4.3 -30232.22 2.4
## 9 09/01/2009 0.9 -31.80 4.9 -30597.70 2.3
## 10 10/01/2009 0.0 -31.76 1.9 -31337.28 2.3
## 11 11/01/2009 0.0 -31.70 3.4 -29845.15 2.3
## 12 12/01/2009 0.0 -31.57 3.3 -28745.28 2.3
## 13 13/01/2009 1.1 -31.54 6.0 -28932.77 2.3
## 14 14/01/2009 0.0 -31.44 8.2 -28600.13 2.3
## 15 15/01/2009 0.0 -31.26 7.8 -24973.06 2.3
## 16 16/01/2009 0.0 -31.50 5.1 -31388.26 2.3
## 17 17/01/2009 0.0 -31.61 1.6 -30941.57 2.3
## 18 18/01/2009 0.1 -31.15 6.4 -23043.74 2.3
## 19 19/01/2009 0.1 -30.95 10.3 -21658.75 2.3
## 20 20/01/2009 0.0 -30.93 12.1 -23793.70 2.3
Plot data
library('ggplot2')
library('forecast')
library('zoo')
library('dplyr')
library('data.table')
library('imputeTS')
library('xts')
library('tseries')
library('stats')
library('nlme')
library('fpp')
library('lubridate')
library('TSstudio')
library('AICcmodavg')
# sort by date
df$Date <- as.Date(df$Date, format= "%d/%m/%Y")
df <- df[order(df$Date), ]
interval = df$Date - shift(df$Date, n=1, fill=NA, type="lag")
for(i in interval)
{
if(isTRUE(i > 1) || is.null(i))
{
print(i)
}
}
As we can see, there are some missing values that we need to fix. We can see it from stat and from graphics where red places or outliers.
# find that there are some missing values in data set
ggplot_na_distribution(df$Depth_to_Groundwater)
ggplot_na_distribution(df$Rainfall)
ggplot_na_distribution(df$Temperature)
ggplot_na_distribution(df$` Volume`)
ggplot_na_distribution(df$` Hydrometry`)
statsNA(df$Depth_to_Groundwater)
## [1] "Length of time series:"
## [1] 4199
## [1] "-------------------------"
## [1] "Number of Missing Values:"
## [1] 39
## [1] "-------------------------"
## [1] "Percentage of Missing Values:"
## [1] "0.929%"
## [1] "-------------------------"
## [1] "Number of Gaps:"
## [1] 4
## [1] "-------------------------"
## [1] "Average Gap Size:"
## [1] 9.75
## [1] "-------------------------"
## [1] "Stats for Bins"
## [1] " Bin 1 (1050 values from 1 to 1050) : 0 NAs (0%)"
## [1] " Bin 2 (1050 values from 1051 to 2100) : 28 NAs (2.67%)"
## [1] " Bin 3 (1050 values from 2101 to 3150) : 0 NAs (0%)"
## [1] " Bin 4 (1049 values from 3151 to 4199) : 11 NAs (1.05%)"
## [1] "-------------------------"
## [1] "Longest NA gap (series of consecutive NAs)"
## [1] "21 in a row"
## [1] "-------------------------"
## [1] "Most frequent gap size (series of consecutive NA series)"
## [1] "7 NA in a row (occuring 2 times)"
## [1] "-------------------------"
## [1] "Gap size accounting for most NAs"
## [1] "21 NA in a row (occuring 1 times, making up for overall 21 NAs)"
## [1] "-------------------------"
## [1] "Overview NA series"
## [1] " 4 NA in a row: 1 times"
## [1] " 7 NA in a row: 2 times"
## [1] " 21 NA in a row: 1 times"
statsNA(df$Rainfall)
## [1] "Length of time series:"
## [1] 4199
## [1] "-------------------------"
## [1] "Number of Missing Values:"
## [1] 0
## [1] "-------------------------"
## [1] "Percentage of Missing Values:"
## [1] "0%"
## [1] "-------------------------"
## [1] "Number of Gaps:"
## [1] 0
## [1] "-------------------------"
## [1] "Average Gap Size:"
## [1] 0
## [1] "-------------------------"
## [1] "No NAs in the time series."
## [1] "-------------------------"
## [1] "There are no NAs in the time series"
statsNA(df$Temperature)
## [1] "Length of time series:"
## [1] 4199
## [1] "-------------------------"
## [1] "Number of Missing Values:"
## [1] 0
## [1] "-------------------------"
## [1] "Percentage of Missing Values:"
## [1] "0%"
## [1] "-------------------------"
## [1] "Number of Gaps:"
## [1] 0
## [1] "-------------------------"
## [1] "Average Gap Size:"
## [1] 0
## [1] "-------------------------"
## [1] "No NAs in the time series."
## [1] "-------------------------"
## [1] "There are no NAs in the time series"
statsNA(df$` Volume`)
## [1] "Length of time series:"
## [1] 4199
## [1] "-------------------------"
## [1] "Number of Missing Values:"
## [1] 1
## [1] "-------------------------"
## [1] "Percentage of Missing Values:"
## [1] "0.0238%"
## [1] "-------------------------"
## [1] "Number of Gaps:"
## [1] 1
## [1] "-------------------------"
## [1] "Average Gap Size:"
## [1] 1
## [1] "-------------------------"
## [1] "Stats for Bins"
## [1] " Bin 1 (1050 values from 1 to 1050) : 0 NAs (0%)"
## [1] " Bin 2 (1050 values from 1051 to 2100) : 0 NAs (0%)"
## [1] " Bin 3 (1050 values from 2101 to 3150) : 0 NAs (0%)"
## [1] " Bin 4 (1049 values from 3151 to 4199) : 1 NAs (0.0953%)"
## [1] "-------------------------"
## [1] "Longest NA gap (series of consecutive NAs)"
## [1] "1 in a row"
## [1] "-------------------------"
## [1] "Most frequent gap size (series of consecutive NA series)"
## [1] "1 NA in a row (occuring 1 times)"
## [1] "-------------------------"
## [1] "Gap size accounting for most NAs"
## [1] "1 NA in a row (occuring 1 times, making up for overall 1 NAs)"
## [1] "-------------------------"
## [1] "Overview NA series"
## [1] " 1 NA in a row: 1 times"
statsNA(df$` Hydrometry`)
## [1] "Length of time series:"
## [1] 4199
## [1] "-------------------------"
## [1] "Number of Missing Values:"
## [1] 0
## [1] "-------------------------"
## [1] "Percentage of Missing Values:"
## [1] "0%"
## [1] "-------------------------"
## [1] "Number of Gaps:"
## [1] 0
## [1] "-------------------------"
## [1] "Average Gap Size:"
## [1] 0
## [1] "-------------------------"
## [1] "No NAs in the time series."
## [1] "-------------------------"
## [1] "There are no NAs in the time series"
So we need to fix data
times_dd <- ts(df$Depth_to_Groundwater, start = df$Date[1], frequency=1)
ggplot_na_distribution(times_dd)
times_dd <- na_interpolation(times_dd, option="spline")
After interpolation
ggplot_na_distribution(times_dd)
plot( times_dd,xlab = "Time", ylab = "Depth_to_Groundwater", main="Depth", axes=FALSE, type="l")
axis(1,at=seq(df$Date[1], df$Date[length(df$Date)], by="years"),labels=unique(as.character(df$Date, format= "%Y")))
axis(2)
box()
Volume has some troubles
df$` Volume` <- ifelse(df$` Volume` == 0, NaN, df$` Volume`)
times_vv <- ts(df$` Volume`, start = df$Date[1], frequency=1)
ggplot_na_distribution(times_vv)
times_vv <- na_interpolation(times_vv, option="linear")
ggplot_na_distribution(times_vv)
plot( times_vv,xlab = "Time", ylab = "Volume", main="Volume", axes=FALSE, type="l")
axis(1,at=seq(df$Date[1], df$Date[length(df$Date)], by="years"),labels=unique(as.character(df$Date, format= "%Y")))
axis(2)
box()
Interpolation can’t be realised due to there are no missing values, there are only 0 values, that are anomaly for for current data. So we can change 0 to nan and then interpolate
df$` Hydrometry` <- ifelse(df$` Hydrometry` == 0, NaN, df$` Hydrometry`)
times_hh <- ts(df$` Hydrometry`, start = df$Date[1], frequency=1)
ggplot_na_distribution(times_hh)
times_hh <- na_interpolation(times_hh, option="linear")
ggplot_na_distribution(times_hh)
plot( times_hh,xlab = "Time", ylab = "Hydrometry", main="Hydrometry", axes=FALSE, type="l")
axis(1,at=seq(df$Date[1], df$Date[length(df$Date)], by="years"),labels=unique(as.character(df$Date, format= "%Y")))
axis(2)
box()
the series can be assembled unevenly over time and to eliminate this resampling is used So if data was collected once an hour, and then suddenly once a day, and then once every 3 hours we will make it at equal intervals. But there we haven’t got any troubles that was checd by counting interval higher
times_dd <- ts(df$Depth_to_Groundwater, start = df$Date[1], frequency=1)
times_dd <- na_interpolation(times_dd, option="spline")
time_dd_week <- period.apply(as.xts(times_dd), endpoints(as.xts(times_dd), "weeks"), range)
times_dd <- na_interpolation(time_dd_week, option="spline")
plot( time_dd_week,xlab = "Time", ylab = "Depth_to_Groundwater", main="Depth", axes=FALSE, type="l")
axis(1,at=seq(df$Date[1], df$Date[length(df$Date)], by="years"),labels=unique(as.character(df$Date, format= "%Y")))
axis(2)
box()
times_t <- ts(df$Temperature, start = df$Date[1], frequency=1)
times_tt_week <- period.apply(as.xts(times_t), endpoints(as.xts(times_t), "weeks"), range)
plot( times_tt_week,xlab = "Time", ylab = "Temperature", main="Temperature", axes=FALSE, type="l")
axis(1,at=seq(df$Date[1], df$Date[length(df$Date)], by="years"),labels=unique(as.character(df$Date, format= "%Y")))
axis(2)
box()
times_vv <- ts(df$` Volume`, start = df$Date[1], frequency=1)
times_vv_week <- period.apply(as.xts(times_vv), endpoints(as.xts(times_vv), "weeks"), range)
times_vv_week <- na_interpolation(times_vv_week, option="linear")
plot( times_vv_week,xlab = "Time", ylab = "Volume", main="Volume", axes=FALSE, type="l")
axis(1,at=seq(df$Date[1], df$Date[length(df$Date)], by="years"),labels=unique(as.character(df$Date, format= "%Y")))
axis(2)
box()
times_hh <- ts(df$` Hydrometry`, start = df$Date[1], frequency=1)
times_hh_week <- period.apply(as.xts(times_hh), endpoints(as.xts(times_hh), "weeks"), range)
times_hh_week <- na_interpolation(times_hh_week, option="linear")
plot( times_hh_week,xlab = "Time", ylab = "Hydrometry", main="Hydrometry", axes=FALSE, type="l")
axis(1,at=seq(df$Date[1], df$Date[length(df$Date)], by="years"),labels=unique(as.character(df$Date, format= "%Y")))
axis(2)
box()
times <- ts(df$Rainfall, start = df$Date[1], frequency=1)
times_week <- period.apply(as.xts(times), endpoints(as.xts(times), "weeks"), range)
plot( times_week,xlab = "Time", ylab = "Rainfall", main="Rainfall", axes=FALSE)
axis(1,at=seq(df$Date[1], df$Date[length(df$Date)], by="years"),labels=unique(as.character(df$Date, format= "%Y")))
axis(2)
box()
We can say that it is not obvious to resample data, we can make predictions for one day and it will be quite accurate and full. If we will take for day or month we can lose some info that will be critical for people’ life. Also there is no such big outliers that require smoothing
#depth <- diff(depth)
depth <- na_interpolation(df$Depth_to_Groundwater, option = "linear")
depth_ts <- ts(depth, start = df$Date[1], frequency=365)
look at seasonal, because we can make some decisions from the seasonalty.
ts_seasonal(ts(na_interpolation(df$Depth_to_Groundwater, option = "linear"), start = df$Date[1], frequency=12), type = "all")
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
ts_plot(depth_ts)
rainfall <- na_interpolation(df$Rainfall, option = "linear")
rainfall_ts <- ts(rainfall, start = df$Date[1], frequency=12)
rainfall_ts <- diff(rainfall_ts)
ts_seasonal(rainfall_ts, type = "all")
Rainfall ???
temperature <- na_interpolation(df$Temperature, option = "linear")
temperature_ts <- ts(temperature, start = df$Date[1], frequency=12)
temperature_ts <- diff(temperature_ts)
ts_seasonal(temperature_ts, type = "all")
From the plot we can see that the maximum temperature was in August, minimum in December
volume <- na_interpolation(df$` Volume`, option = "linear")
volume_ts <- ts(volume, start = df$Date[1], frequency=12)
volume_ts <- diff(volume_ts)
ts_seasonal(volume_ts, type = "all")
Maximum volume was in March, minimum in December
hydrometry <- na_interpolation(df$` Hydrometry`, option = "linear")
hydrometry_ts <- ts(hydrometry, start = df$Date[1], frequency=12)
hydrometry_ts <- diff(hydrometry_ts)
ts_seasonal(hydrometry_ts, type = "all")
Maximum in August, minimum in January.
The volume and hydrometry reached their minimum around the same time